Loading packages:

library(Seurat)
library(tidyverse)
library(gprofiler2)
library(sleepwalk)
library(SCINA)
library(gridExtra)
theme_set(theme_bw(base_size = 14))

#Loading data

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R")

data_dir_GoT <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R/AML1022.1_GoT'
data_dir_nanoranger <- '/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024/Processed data to be loaded in R/AML1022.1_nanoranger'


AML_GoT <- Read10X(
  data_dir_GoT,
  gene.column = 2,
  cell.column = 1,
  unique.features = TRUE,
  strip.suffix = FALSE
)

AML_nanoranger <- Read10X(
  data_dir_nanoranger,
  gene.column = 2,
  cell.column = 1,
  unique.features = TRUE,
  strip.suffix = FALSE
)
seurat_object_AML_GoT = CreateSeuratObject(AML_GoT)
seurat_object_AML_nanoranger = CreateSeuratObject(AML_nanoranger)
seurat_object_AML_GoT$orig.ident = 'AML_GoT'
seurat_object_AML_nanoranger$orig.ident = 'AML_nanoranger'


AML_merged <- merge(seurat_object_AML_GoT, y = seurat_object_AML_nanoranger, project = "AML_merged")
## Warning in CheckDuplicateCellNames(object.list = objects): Some cell names are
## duplicated across objects provided. Renaming to enforce unique cell names.
as_tibble(AML_merged@meta.data) %>%
  group_by(orig.ident) %>%
  summarise(initial_cells=n()) -> cell_counts

cell_counts
PercentageFeatureSet(AML_merged,pattern="^MT-") -> AML_merged$percent_mt
PercentageFeatureSet(AML_merged,pattern="MALAT1") -> AML_merged$percent_Malat
apply(AML_merged@assays$RNA@counts, 2, function(x) max((100*x)/sum(x))) -> AML_merged$percent_Largest

AML_merged[[]]
as_tibble(AML_merged@meta.data) -> qc_metrics
head(qc_metrics)
qc_metrics %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=value)) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot linear scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)

qc_metrics %>%
  pivot_longer(cols=-orig.ident, names_to="metric",values_to="value") %>%
  ggplot(aes(x=orig.ident, y=log(value))) +
  geom_violin(fill="lightskyblue")+
  facet_grid(rows=vars(metric), scales="free_y")

ggsave(paste0("Violin plot log scale no intercept "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 10, dpi = 300)
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")

NormalizeData(AML_merged, normalization.method = "CLR") -> AML_merged

saveRDS(AML_merged, file="AML_merged_norm_latest.RDS")
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")


subset <- AML_merged@assays$RNA@data[,c(rep(F,69),T)] %>%
  as_tibble() %>%
  pivot_longer(
    cols=everything(),
    names_to="cell",
    values_to="value"
  ) %>%
  left_join(
    as_tibble(AML_merged@meta.data, rownames="cell") %>% 
      dplyr::select(cell,orig.ident)
  ) 

subset %>%
  ggplot(aes(x=value, colour=orig.ident, group=cell)) +
  geom_density(linewidth=0.25)+
  coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
  scale_colour_brewer(palette = "Set1")

ggsave(paste0("Post norm check all samples together "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)

subset %>%
  ggplot(aes(x=value, colour=orig.ident, group=cell)) +
  geom_density(linewidth=0.25)+
  coord_cartesian(xlim=c(0,3), ylim=c(0,1)) +
  scale_colour_brewer(palette = "Set1") +
  facet_wrap(vars(orig.ident))

ggsave(paste0("Post norm check 4 samples separate "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
FindVariableFeatures(
  AML_merged,
  selection.method = "vst",
  nfeatures = 500
) -> AML_merged

as_tibble(HVFInfo(AML_merged),rownames = "Gene") -> variance.data

variance.data %>% 
  mutate(hypervariable=Gene %in% VariableFeatures(AML_merged)
) -> variance.data

variance.data %>%
  arrange(desc(variance.standardized)) -> variance.data

head(variance.data,n=100)

Plot variable genes

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")

variance.data %>%
  arrange(hypervariable) %>%
  ggplot(aes(x=mean,y=variance, colour=hypervariable)) +
  geom_point() +
  scale_y_log10() +
  scale_x_log10() +
  scale_colour_manual(values=c("grey","red2"))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis

ggsave(paste0("HVG gene plot "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous x-axis
#Extract gene list for GO analysis
#https://www.bioinformatics.babraham.ac.uk/goliath/goliath.cgi

variance.data %>%
  arrange(desc(variance.standardized)) %>%
  slice(1:100) %>%
  pull(Gene) %>%
  sapply(function(x)cat(paste0(x,"\n"))) -> temp
## CA1
## DLK1
## AHSP
## HBB
## PRTN3
## HBD
## HBM
## PPBP
## MPO
## HBG2
## ALAS2
## CA2
## IGLV1-44
## HBA2
## TRBV7-2
## PF4
## GNLY
## HBA1
## APOC1
## THBS1
## SLC4A1
## HIST1H4C
## PRDX2
## IGHV4-34
## AZU1
## SLC25A37
## HIST1H1B
## MS4A2
## TRBV5-1
## CLEC1B
## IFI27
## GP9
## TRBV7-3
## PRSS2
## TRBV7-9
## CCL4
## SNCA
## TRBV3-1
## ZNF385D
## TRBV10-3
## TRBV12-3
## TRBV11-2
## GDF15
## IFIT1B
## CTSG
## XACT
## GAL
## TRBV24-1
## CD1E
## TRBV6-2
## TRBV6-5
## PTGDS
## HDC
## CD9
## TRBV6-6
## JCHAIN
## TRBV9
## TRBV18
## TSPO2
## TRBV28
## TRBV30
## XCL1
## ITGB3
## TRBV13
## HBG1
## LTBP1
## TRBV14
## FCER1A
## TUBB1
## TRBV4-1
## TRBV7-6
## TRBV5-4
## RRM2
## TRBV29-1
## TRBV2
## MKI67
## GZMB
## IGHA1
## TRBV5-6
## ITGA2B
## TRDV2
## HIST1H3B
## ACY3
## ANK1
## MPIG6B
## GYPB
## TRBV4-2
## TRBV19
## LTB
## IL32
## DNASE1L3
## TRAV21
## IFIT2
## TRAV13-1
## ESAM
## IGLC2
## PRF1
## TRAV38-2DV8
## TRBV6-1
## KIAA1671

Dim red with PCA using var genes

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")

ScaleData(AML_merged, features=rownames(data)) -> AML_merged
## Centering and scaling data matrix
RunPCA(
  AML_merged,
  features=VariableFeatures(AML_merged)
) -> AML_merged
## PC_ 1 
## Positive:  TUBB, HMGA1, STMN1, PCLAF, TYMS, MIF, TUBA1B, DUT, MCM7, NME1 
##     HIST1H4C, MPC2, H2AFZ, PRSS57, CENPU, IGLL1, GATA1, FABP5, KLF1, PCNA 
##     PDLIM1, FEN1, MIR924HG, MYC, MINPP1, MRPL16, DTL, KCNH2, REXO2, PKMYT1 
## Negative:  S100A9, TYROBP, S100A8, VCAN, FCN1, IFI30, CD14, SERPINA1, MNDA, NCF1 
##     LYZ, TMEM176B, CEBPD, PLXDC2, LGALS2, S100A12, TMEM176A, CST3, FPR1, CFD 
##     RETN, S100P, IL1R2, PID1, PLAUR, JUN, CDKN1A, TIMP1, MAFB, CCL3 
## PC_ 2 
## Positive:  IL32, CD3E, CD247, CD7, GZMA, LTB, IL7R, PRF1, CD69, KLRB1 
##     GNLY, CST7, GZMB, KLRD1, GZMH, FGFBP2, CLIC3, PRSS2, CD27, CD8A 
##     GZMK, CCL5, FCGR3A, IGHM, NKG7, NKAIN2, AQP3, SPON2, KLRC1, CD8B 
## Negative:  LYZ, IFI30, CST3, FCN1, VCAN, S100A9, S100A8, SERPINA1, MNDA, CD14 
##     CEBPD, TYROBP, CFD, TMEM176B, LGALS2, NCF1, BLVRB, TIMP1, PLXDC2, TMEM176A 
##     S100A12, CDKN1A, RETN, CD36, FPR1, PLAUR, S100P, NRGN, IL1R2, TEX14 
## PC_ 3 
## Positive:  C1QTNF4, STMN1, MIF, HLA-DPB1, PRSS57, HLA-DPA1, NKAIN2, FABP5, IGHM, IGLL1 
##     NME1, H2AFZ, CD38, AL589693.1, MZB1, HMGA1, TUBA1B, DUT, MRPL16, TUBB 
##     HLA-DQA1, HLA-DQA2, WDR49, MIR924HG, CD69, AC002454.1, SLC24A3, MPO, MCM7, TXNDC5 
## Negative:  ALAS2, HBA1, AHSP, HBA2, CA1, HBB, HBM, SELENBP1, SLC25A37, SLC4A1 
##     SNCA, EPB42, GYPB, HBD, CA2, HBQ1, FECH, SLC25A39, GMPR, PRDX2 
##     PHOSPHO1, IFI27, TSPO2, ANK1, TNS1, MYL4, RHD, SMIM1, IFIT1B, DCAF12 
## PC_ 4 
## Positive:  C1QTNF4, NKAIN2, IGHM, PRSS2, SCN3A, H1F0, AL589693.1, MZB1, HEMGN, JCHAIN 
##     ACY3, HLA-DPB1, GNG11, TRIM58, HLA-DPA1, COL24A1, HBA2, WDR49, PRSS57, MMRN1 
##     ALAS2, HBA1, SLC24A3, SNCA, HBB, STMN1, HLA-DQA1, HLA-DQA2, SLC25A37, ADGRB3 
## Negative:  CST7, GZMA, GNLY, PRF1, CD247, CD7, GZMB, NKG7, KLRD1, CCL5 
##     FGFBP2, FCGR3A, GZMH, KLRB1, CLIC3, CCL4, IL32, KLRC1, SPON2, CD3E 
##     XCL2, LAT, JUN, DUSP2, CCL4L2, ZNF331, GZMK, CRIP1, CD8A, IL7R 
## PC_ 5 
## Positive:  ZNF385D, GATA1, CSF2RB, SEMA3C, MYO16, CYP7B1, GATA2, TRIB2, SLC40A1, CSF1 
##     ANGPT2, PDLIM1, KCNH2, LINC02284, FCER1A, GP9, HDC, TAFA2, KLF1, RBPMS2 
##     MS4A2, MS4A4E, SLC24A3, AC244502.1, PRKAR2B, SMIM1, ANKRD9, DLK1, MPIG6B, PF4 
## Negative:  C1QTNF4, AL589693.1, IGHM, MZB1, HLA-DPA1, HLA-DPB1, WDR49, MPO, NKAIN2, SCN3A 
##     NKG7, CTSG, DDIT4, HLA-DQA1, ACY3, HLA-DQA2, JCHAIN, H1F0, SLC25A39, AZU1 
##     TUBA1B, ID3, CD69, PRSS2, ID1, AHSP, ALAS2, HEMGN, DUSP2, MIR924HG
PCAPlot(AML_merged, dims=c(1,2), group.by= 'orig.ident')

ggsave(paste0("PCA dim 1&2 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(AML_merged, dims=c(3,4), group.by= 'orig.ident')

ggsave(paste0("PCA dim 3&4 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(AML_merged, dims=c(5,6), group.by= 'orig.ident')

ggsave(paste0("PCA dim 5&6 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(AML_merged, dims=c(7,8), group.by= 'orig.ident')

ggsave(paste0("PCA dim 7&8 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


PCAPlot(AML_merged, dims=c(9,10), group.by= 'orig.ident')

ggsave(paste0("PCA dim 9&10 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 6, dpi = 300)


ElbowPlot(AML_merged, ndims=40)

ggsave(paste0("PCA SD vs dim "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 5, dpi = 300)
Idents(AML_merged) <- 'orig.ident'
setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")

saved.seed <- 5540
set.seed(saved.seed)

RunUMAP(
  AML_merged,
  dims=1:30,
  n.neighbors = 50,
  min.dist = 1,
  seed.use = saved.seed
) -> AML_merged
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 14:39:13 UMAP embedding parameters a = 0.115 b = 1.929
## 14:39:13 Read 10119 rows and found 30 numeric columns
## 14:39:13 Using Annoy for neighbor search, n_neighbors = 50
## 14:39:13 Building Annoy index with metric = cosine, n_trees = 50
## 0%   10   20   30   40   50   60   70   80   90   100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 14:39:15 Writing NN index file to temp file /var/folders/rx/vvf3pzf55v912fz97g52nz3m0000gn/T//RtmpwV81Hi/file13853dcb38ce
## 14:39:15 Searching Annoy index using 1 thread, search_k = 5000
## 14:39:21 Annoy recall = 100%
## 14:39:23 Commencing smooth kNN distance calibration using 1 thread with target n_neighbors = 50
## 14:39:24 Initializing from normalized Laplacian + noise (using irlba)
## 14:39:25 Commencing optimization for 200 epochs, with 697754 positive edges
## 14:39:36 Optimization finished
DimPlot(AML_merged,reduction = "umap", pt.size = 0.3)

ggsave(paste0("UMAP all samples "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 8, dpi = 300)


DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, split.by = "orig.ident", ncol = 2)

ggsave(paste0("UMAP 3 samples separate "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 12, height = 10, dpi = 300)

Generating SNN Clusters

FindNeighbors(AML_merged, reduction = "pca", dims = 1:30) -> AML_merged
## Computing nearest neighbor graph
## Computing SNN
AML_merged@graphs$RNA_snn[1:10,1:10]
## 10 x 10 sparse Matrix of class "dgCMatrix"
##   [[ suppressing 10 column names 'AAACCTGAGAATTGTG-1_1', 'AAACCTGAGCTGATAA-1_1', 'AAACCTGAGGCAATTA-1_1' ... ]]
##                                         
## AAACCTGAGAATTGTG-1_1 1 . . . . . . . . .
## AAACCTGAGCTGATAA-1_1 . 1 . . . . . . . .
## AAACCTGAGGCAATTA-1_1 . . 1 . . . . . . .
## AAACCTGAGGTGCAAC-1_1 . . . 1 . . . . . .
## AAACCTGAGTGCAAGC-1_1 . . . . 1 . . . . .
## AAACCTGAGTGTACGG-1_1 . . . . . 1 . . . .
## AAACCTGCATCGGTTA-1_1 . . . . . . 1 . . .
## AAACCTGGTACCGCTG-1_1 . . . . . . . 1 . .
## AAACCTGGTCTCCATC-1_1 . . . . . . . . 1 .
## AAACCTGGTTCACGGC-1_1 . . . . . . . . . 1
FindClusters(AML_merged, resolution = 0.1) -> AML_merged
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10119
## Number of edges: 379815
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9661
## Number of communities: 8
## Elapsed time: 1 seconds
head(AML_merged$seurat_clusters, n=50)
## AAACCTGAGAATTGTG-1_1 AAACCTGAGCTGATAA-1_1 AAACCTGAGGCAATTA-1_1 
##                    7                    1                    1 
## AAACCTGAGGTGCAAC-1_1 AAACCTGAGTGCAAGC-1_1 AAACCTGAGTGTACGG-1_1 
##                    2                    0                    2 
## AAACCTGCATCGGTTA-1_1 AAACCTGGTACCGCTG-1_1 AAACCTGGTCTCCATC-1_1 
##                    1                    0                    0 
## AAACCTGGTTCACGGC-1_1 AAACCTGTCACGCATA-1_1 AAACGGGAGAGCTTCT-1_1 
##                    3                    1                    4 
## AAACGGGAGAGTTGGC-1_1 AAACGGGAGCGTAATA-1_1 AAACGGGAGGCTATCT-1_1 
##                    1                    3                    2 
## AAACGGGGTACCGTTA-1_1 AAACGGGGTCACAAGG-1_1 AAACGGGGTCTCTTTA-1_1 
##                    0                    3                    2 
## AAACGGGGTGTGACGA-1_1 AAACGGGTCACATAGC-1_1 AAACGGGTCAGCAACT-1_1 
##                    4                    0                    0 
## AAACGGGTCCTGTAGA-1_1 AAACGGGTCGATAGAA-1_1 AAAGATGAGACTTGAA-1_1 
##                    3                    0                    5 
## AAAGATGAGAGACTTA-1_1 AAAGATGAGCGTTTAC-1_1 AAAGATGAGGAATCGC-1_1 
##                    0                    0                    0 
## AAAGATGAGGACGAAA-1_1 AAAGATGAGTTACCCA-1_1 AAAGATGCACCCATGG-1_1 
##                    0                    4                    0 
## AAAGATGGTGATAAGT-1_1 AAAGATGGTGATGTGG-1_1 AAAGATGGTGGTGTAG-1_1 
##                    1                    1                    1 
## AAAGATGGTGTTCGAT-1_1 AAAGATGTCTCGATGA-1_1 AAAGCAAAGACGCACA-1_1 
##                    3                    0                    3 
## AAAGCAAAGATAGGAG-1_1 AAAGCAAAGGAGCGTT-1_1 AAAGCAAAGTAAGTAC-1_1 
##                    7                    0                    1 
## AAAGCAAAGTCCATAC-1_1 AAAGCAACACAGACAG-1_1 AAAGCAACATTCCTCG-1_1 
##                    0                    3                    3 
## AAAGCAAGTCATATCG-1_1 AAAGCAAGTGTGTGCC-1_1 AAAGCAATCAATCACG-1_1 
##                    1                    2                    1 
## AAAGCAATCACAACGT-1_1 AAAGCAATCTACTTAC-1_1 AAAGTAGAGCGTCTAT-1_1 
##                    1                    2                    5 
## AAAGTAGAGGGTTCCC-1_1 AAAGTAGCAATCCGAT-1_1 
##                    1                    3 
## Levels: 0 1 2 3 4 5 6 7

Plot clusters on UMAP

setwd("/Users/gexin/Library/CloudStorage/OneDrive-UniversityofCambridge/CamBioHackathon_2024")

DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 8)

ggsave(paste0("UMAP 2 samples with SNN clusters resolution 0.1 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 8, height = 8, dpi = 300)

DimPlot(AML_merged,reduction = "umap", pt.size = 0.3, label = TRUE, label.size = 4, split.by = "orig.ident", ncol = 2) 

ggsave(paste0("UMAP 2 samples separate with SNN clusters resolution 0.1 "
              , format(Sys.time(), "%Y-%m-%d")
              , ".pdf"), width = 12, height = 10, dpi = 300)